This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.

Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.

They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.

Data import

Import of the expression data and the N-responsive genes and regulators :

load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426   45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426  201
ALPHAS <- seq(0,1, by = 0.1)

Lauching the permutations and the true data inferences 100 times

nCores = 45
mats <- list()
nrep <- 100
for(alpha in ALPHAS){ # exploring PWM integration strength
  for(rep in 1:nrep){ # exploring inherent variability
    
    mat_rf <- bRF_inference(counts, genes, tfs, nTrees = 2000,
                            alpha = alpha,
                            pwm_occurrence = pwm_occurrence,
                            nCores = nCores,
                            importance = "%IncMSE")
    
    mat_rf_perm <- bRF_inference(counts, genes, tfs, nTrees = 2000,
                            alpha = alpha, tf_expression_permutation = TRUE,
                            pwm_occurrence = pwm_occurrence,
                            nCores = nCores,
                            importance = "%IncMSE")
    
    mats[[paste0("bRF_", as.character(alpha),  '_trueData_', rep)]] <- mat_rf
    mats[[paste0("bRF_", as.character(alpha),  '_shuffled_', rep)]] <- mat_rf_perm
    
  }
}
save(mats, file = "results/100_permutations_bRF_importances.rdata")

Thresholding the networks

load( "results/100_permutations_bRF_importances.rdata")
edges <- list()
densities <- c(0.005, 0.01,0.05,0.075)

for(name in names(mats)){ # exploring importance threshold stringency
      for(density in densities){
        edges[[paste0(name, '_', density)]] <-
        bRF_network(mats[[name]], density = density, pwm_occurrence, genes, tfs)
      }
}

save(edges, file = "results/100_permutations_bRF_edges.rdata")

Thresholding the networks

load( "results/100_permutations_bRF_importances.rdata")
load("results/clusters_mse_bRF_100permutations.rdata")
positive_genes <- names(clusters_rf[clusters_rf==2])

edges <- list()
densities <- c(0.005, 0.01,0.05,0.075)

for(name in names(mats)){ # exploring importance threshold stringency
      for(density in densities){
        edges[[paste0(name, '_', density)]] <-
        bRF_network(mats[[name]][,positive_genes], density = density, pwm_occurrence, positive_genes, tfs)
      }
}

save(edges, file = "results/100_permutations_bRF_edges_positive_same_density.rdata")
color_palette = c("#C12131", "#EC5D2F", "#FE945C", "#FFC08E" )
prettyZero <- function(l){
    max.decimals = max(nchar(str_extract(l, "\\.[0-9]+")), na.rm = T)-1
    lnew = formatC(l, replace.zero = T, zero.print = "0",
        digits = max.decimals, format = "f", preserve.width=T)
    return(lnew)
}

settings <- c("method", "alpha", "dataset","rep", "density")

PWM support

load("results/100_permutations_bRF_edges.rdata")

# number of edges per network
nrows <-
  data.frame(alpha_rep = names(unlist(lapply(edges, FUN = nrow))),
             n_edges = unlist(lapply(edges, FUN = nrow)))

nrows[, settings] <-
  str_split_fixed(nrows$alpha_rep, '_', length(settings))

edges_num <- lapply(edges, function(df)
  df[sapply(df, is.numeric)])

pwm_support <-
  data.frame(alpha_rep = names(unlist(lapply(edges_num, FUN = nrow))),
             pwm = unlist(lapply(edges_num, FUN = colMeans)))
pwm_support[, settings] <-
  str_split_fixed(pwm_support$alpha_rep, '_', length(settings))

pwm_support %>%
  left_join(nrows, by = settings) %>%
  mutate(alpha = as.numeric(alpha),
         density_label = paste(density, ':', n_edges, 'edges')) %>%
  ggplot(aes(color = density_label, x = alpha, y = pwm)) +
  ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
  geom_point() +
  geom_smooth(aes(fill = density_label, linetype =dataset) , alpha = 0.1) +
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.position = "top",
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12)
  ) +
  xlab(expression(alpha)) + ylab("Mean PWM score in selected edges") +
  ggtitle("Average PWM support of inferred edges") +
  guides(color = guide_legend(nrow = 2, byrow = TRUE),
         fill = guide_legend(nrow = 2, byrow = TRUE)) +
  ylab(expression(paste("mean(", pi[tr], ")"))) +
  scale_x_continuous(labels = prettyZero) +
  scale_color_manual(name = "Network density", values = color_palette) +
  scale_fill_manual(name = "Network density", values = color_palette)

topology of the all-genes networks

get_nTF <- function(net){
  return(mean(table(net$to)))
}

get_TF <- function(net){
  return(length(unique(net$from)))
}

get_nGenes <- function(net){
  return(length(unique(net$to)))
}

# subset <- edges[as.numeric(str_split_fixed(names(edges), '_', 5)[,4])<6]

topo <- data.frame(alpha_rep = names(edges),
                n_TFs = unlist(lapply(edges, FUN = get_nTF)),
                TFs = unlist(lapply(edges, FUN = get_TF)),
                n_genes = unlist(lapply(edges, FUN = get_nGenes)))

topo[, settings] <- str_split_fixed(topo$alpha_rep,'_', length(settings))

data_topo <- topo %>%  
  left_join(nrows, by = settings) %>%
  mutate(alpha = as.numeric(alpha)) %>%
  mutate(density_label = paste(density, ':', n_edges, 'edges')) %>%
  group_by(alpha, density, dataset) %>%
  mutate(mean_n_TFs = mean(n_TFs, na.rm=T),
            sd_n_TFs = sd(n_TFs, na.rm=T),
            mean_TFs = mean(TFs, na.rm=T),
            sd_TFs = sd(TFs, na.rm=T),
            mean_nGenes = mean(n_genes, na.rm=T),
            sd_nGenes = sd(n_genes, na.rm=T)) 

regs_per_genes <- data_topo %>%
  ggplot(aes(color = dataset, fill = dataset,x = alpha, y = n_TFs)) +
  ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + geom_point(alpha = 0.2) +
  geom_smooth(se = F) +
  geom_ribbon(aes(ymin = mean_n_TFs - sd_n_TFs , 
                    ymax = mean_n_TFs + sd_n_TFs  ), 
                alpha = .4)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'none'
  ) +
  scale_x_continuous(labels = prettyZero) +
  xlab(expression(alpha)) + ylab("Average number of regulator per gene") + 
                 ggtitle("Average number of regulator per gene")

regs_in_network <- data_topo %>%
  ggplot(aes(color = dataset, fill = dataset, x = alpha, y = TFs)) +
  ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + geom_point(alpha = 0.2) +
  geom_ribbon(aes(ymin = mean_TFs - sd_TFs , 
                    ymax = mean_TFs + sd_TFs  ), 
                alpha = .4)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'top'
  ) +
  scale_x_continuous(labels = prettyZero) +
  xlab(expression(alpha))+ ylab("Number of distinct regulators") + 
                 ggtitle("Number of distinct regulators") 


targets_in_network <- data_topo %>%
  ggplot(aes(color = dataset, fill = dataset,x = alpha, y = n_genes)) +
  ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + 
  geom_point(alpha = 0.2) +
  geom_smooth(se = F) +
  geom_ribbon(aes(ymin = mean_nGenes - sd_nGenes , 
                    ymax = mean_nGenes + sd_nGenes  ), 
                alpha = .4)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'top'
  ) +
  scale_x_continuous(labels = prettyZero) +
  xlab(expression(alpha))+ ylab("Number of distinct targets") + 
                 ggtitle("Number of distinct targets") 


figure <- targets_in_network+regs_in_network+regs_per_genes + plot_layout(guides = "collect") + plot_annotation(tag_levels = 'a');figure

PWM support of positive networks

load("results/100_permutations_bRF_edges_positive_same_density.rdata")
# number of edges per network
nrows <-
  data.frame(alpha_rep = names(unlist(lapply(edges, FUN = nrow))),
             n_edges = unlist(lapply(edges, FUN = nrow)))

nrows[, settings] <-
  str_split_fixed(nrows$alpha_rep, '_', length(settings))

edges_num <- lapply(edges, function(df)
  df[sapply(df, is.numeric)])

pwm_support <-
  data.frame(alpha_rep = names(unlist(lapply(edges_num, FUN = nrow))),
             pwm = unlist(lapply(edges_num, FUN = colMeans)))
pwm_support[, settings] <-
  str_split_fixed(pwm_support$alpha_rep, '_', length(settings))

pwm_support %>%
  left_join(nrows, by = settings) %>%
  mutate(alpha = as.numeric(alpha),
         density_label = paste(density, ':', n_edges, 'edges')) %>%
  ggplot(aes(color = density_label, x = alpha, y = pwm)) +
  ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
  geom_point() +
  geom_smooth(aes(fill = density_label, linetype =dataset) , alpha = 0.1) +
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.position = "top",
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12)
  ) +
  xlab(expression(alpha)) + ylab("Mean PWM score in selected edges") +
  ggtitle("Average PWM support of inferred edges") +
  guides(color = guide_legend(nrow = 2, byrow = TRUE),
         fill = guide_legend(nrow = 2, byrow = TRUE)) +
  ylab(expression(paste("mean(", pi[tr], ")"))) +
  scale_x_continuous(labels = prettyZero)

topology of the positive-genes networks

get_nTF <- function(net){
  return(mean(table(net$to)))
}

get_TF <- function(net){
  return(length(unique(net$from)))
}

get_nGenes <- function(net){
  return(length(unique(net$to)))
}

# subset <- edges[as.numeric(str_split_fixed(names(edges), '_', 5)[,4])<6]

topo <- data.frame(alpha_rep = names(edges),
                n_TFs = unlist(lapply(edges, FUN = get_nTF)),
                TFs = unlist(lapply(edges, FUN = get_TF)),
                n_genes = unlist(lapply(edges, FUN = get_nGenes)))

topo[, settings] <- str_split_fixed(topo$alpha_rep,'_', length(settings))

data_topo <- topo %>%  
  left_join(nrows, by = settings) %>%
  mutate(alpha = as.numeric(alpha)) %>%
  mutate(density_label = paste(density, ':', n_edges, 'edges')) %>%
  group_by(alpha, density, dataset) %>%
  mutate(mean_n_TFs = mean(n_TFs, na.rm=T),
            sd_n_TFs = sd(n_TFs, na.rm=T),
            mean_TFs = mean(TFs, na.rm=T),
            sd_TFs = sd(TFs, na.rm=T),
            mean_nGenes = mean(n_genes, na.rm=T),
            sd_nGenes = sd(n_genes, na.rm=T)) 

regs_per_genes <- data_topo %>%
  ggplot(aes(color = dataset, fill = dataset,x = alpha, y = n_TFs)) +
  ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + geom_point(alpha = 0.2) +
  geom_smooth(se = F) +
  geom_ribbon(aes(ymin = mean_n_TFs - sd_n_TFs , 
                    ymax = mean_n_TFs + sd_n_TFs  ), 
                alpha = .4)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'none'
  ) +
  scale_x_continuous(labels = prettyZero) +
  xlab(expression(alpha)) + ylab("Average number of regulator per gene") + 
                 ggtitle("Average number of regulator per gene")

regs_in_network <- data_topo %>%
  ggplot(aes(color = dataset, fill = dataset, x = alpha, y = TFs)) +
  ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + geom_point(alpha = 0.2) +
  geom_ribbon(aes(ymin = mean_TFs - sd_TFs , 
                    ymax = mean_TFs + sd_TFs  ), 
                alpha = .4)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'top'
  ) +
  scale_x_continuous(labels = prettyZero) +
  xlab(expression(alpha))+ ylab("Number of distinct regulators") + 
                 ggtitle("Number of distinct regulators") 


targets_in_network <- data_topo %>%
  ggplot(aes(color = dataset, fill = dataset,x = alpha, y = n_genes)) +
  ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + 
  geom_point(alpha = 0.2) +
  geom_smooth(se = F) +
  geom_ribbon(aes(ymin = mean_nGenes - sd_nGenes , 
                    ymax = mean_nGenes + sd_nGenes  ), 
                alpha = .4)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'top'
  ) +
  scale_x_continuous(labels = prettyZero) +
  xlab(expression(alpha))+ ylab("Number of distinct targets") + 
                 ggtitle("Number of distinct targets") 


figure <- targets_in_network+regs_in_network+regs_per_genes + plot_layout(guides = "collect") + plot_annotation(tag_levels = 'a');figure

Computing validation metrics

val_conn <-
  evaluate_networks(
    edges,
    validation = c("TARGET", "CHIPSeq", "DAPSeq"),
    nCores = 15,
    input_genes = genes,
    input_tfs = tfs
  )

val_conn[, settings] <-
  str_split_fixed(val_conn$network_name, '_', length(settings))

save(val_conn, file = "results/100_permutations_bRF_validation.rdata")


val_chip <-
  evaluate_networks(
    edges,
    validation = c("CHIPSeq"),
    nCores = 15,
    input_genes = genes,
    input_tfs = tfs
  )

val_chip[, settings] <-
  str_split_fixed(val_chip$network_name, '_', length(settings))
save(val_chip, file = "results/100_permutations_bRF_validation_chip.rdata")

val_target <-
  evaluate_networks(
    edges,
    validation = c("TARGET"),
    nCores = 15,
    input_genes = genes,
    input_tfs = tfs
  )

val_target[, settings] <-
  str_split_fixed(val_target$network_name, '_', length(settings))
save(val_target, file = "results/100_permutations_bRF_validation_target.rdata")


val_dap <-
  evaluate_networks(
    edges,
    validation = c("DAPSeq"),
    nCores = 15,
    input_genes = genes,
    input_tfs = tfs
  )

val_dap[, settings] <-
  str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_bRF_validation_dap.rdata")

Plotting validation

draw_precision_recall <- function(validation, data_type){
  data_val <- validation %>%
    group_by(alpha, dataset, density) %>%
    mutate(mean_precision = mean(precision, na.rm = T),
           sd_precision = sd(precision, na.rm = T),
           mean_recall = mean(recall, na.rm = T),
           sd_recall = sd(recall, na.rm = T)) %>%
  dplyr::select(-network_name) %>%
  left_join(nrows, by = settings) %>%
  mutate(alpha = as.numeric(alpha)) %>%
  mutate(density_label = paste(density, ':', n_edges, 'edges')) 

precision <- data_val %>%
    ggplot(aes(
      x = as.numeric(alpha),
      y = precision,
      color = dataset,
      fill = dataset
    ))+
    ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + 
    geom_ribbon(aes(ymin = mean_precision - sd_precision , 
                    ymax = mean_precision + sd_precision  ), 
                alpha = .4)  +theme_pubr(legend = "top")+
    geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha") +
  xlab(expression(alpha)) + ylab("Precision") +
  ggtitle(paste("Precision against", data_type)) +
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'top'
  )

recall <- data_val%>%
    ggplot(aes(
      x = as.numeric(alpha),
      y = recall,
      color = dataset,
      fill = dataset
    ))+
    ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, 
                             nest_line = T, scales = "free") + 
    geom_ribbon(aes(ymin = mean_recall - sd_recall , 
                    ymax = mean_recall + sd_recall  ), 
                alpha = .4)  +theme_pubr(legend = "none")+
    geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha") +
  xlab(expression(alpha)) + ylab("Recall") +
  ggtitle(paste("Recall against", data_type)) +
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'none'
  )
  return(precision/recall)
}

ConnecTF

load("results/100_permutations_bRF_validation.rdata")
draw_precision_recall(val_conn, "ConnecTF")

load(file = "results/100_permutations_bRF_validation_cluster_pos.rdata")
draw_precision_recall(val_conn_pos, "ConnecTF, positive genes")

CHIP Seq only

load("results/100_permutations_bRF_validation_chip.rdata")
draw_precision_recall(val_chip, "CHIP-Seq")

load(file = "results/100_permutations_bRF_validation_chip_cluster_pos.rdata")
draw_precision_recall(val_chip, "CHIP-Seq, positive genes")

TARGET only

load("results/100_permutations_bRF_validation_target.rdata")
draw_precision_recall(val_target, "TARGET")

load(file = "results/100_permutations_bRF_validation_target_cluster_pos.rdata")
draw_precision_recall(val_target, "TARGET, positive genes")

DAP Seq only

load("results/100_permutations_bRF_validation_dap.rdata")
draw_precision_recall(val_dap, "DAP-Seq")

load(file = "results/100_permutations_bRF_validation_dap_cluster_pos.rdata")
draw_precision_recall(val_dap, "DAP-Seq, positive genes")

Not all densities should be explored with this restriction of genes, as there are too few genes to grant sufficient interactions supported by a prior of 1. We keep only the two smallest densities.

#frees up RAM
# edges <- edges[!str_detect(names(edges), '_0.075|0.05')]
val_conn_pos <-
  evaluate_networks(
    edges,
    validation = c("TARGET", "CHIPSeq", "DAPSeq"),
    nCores = 10,
    input_genes = genes,
    input_tfs = tfs
  )


val_conn_pos[, settings] <-
  str_split_fixed(val_conn_pos$network_name, '_', length(settings))
save(val_conn_pos, file = "results/100_permutations_bRF_validation_cluster_pos.rdata")



val_chip <-
  evaluate_networks(
    edges,
    validation = c("CHIPSeq"),
    nCores = 10,
    input_genes = genes,
    input_tfs = tfs
  )

val_chip[, settings] <-
  str_split_fixed(val_chip$network_name, '_', length(settings))
save(val_chip, file = "results/100_permutations_bRF_validation_chip_cluster_pos.rdata")


val_target <-
  evaluate_networks(
    edges,
    validation = c("TARGET"),
    nCores = 10,
    input_genes = genes,
    input_tfs = tfs
  )

val_target[, settings] <-
  str_split_fixed(val_target$network_name, '_', length(settings))
save(val_target, file = "results/100_permutations_bRF_validation_target_cluster_pos.rdata")


val_dap <-
  evaluate_networks(
    edges,
    validation = c("DAPSeq"),
    nCores = 10,
    input_genes = genes,
    input_tfs = tfs
  )

val_dap[, settings] <-
  str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_bRF_validation_dap_cluster_pos.rdata")